library(dplyr)
library(RMySQL)
library(tidyr)
library(ggplot2)
library(magrittr)
source('../../shiny_apps/data_loading.R')
Load in Wordbank common data.
wordbank <- connect.to.wordbank("prod")
common.tables <- get.common.tables(wordbank)
admins <- get.administration.data(common.tables$momed,
common.tables$child,
common.tables$instrumentsmap,
common.tables$administration)
items <- get.item.data(common.tables$wordmapping,
common.tables$instrumentsmap,
common.tables$category)
Get by-child productive vocabulary data.
num.words <- items %>%
filter(type == "word") %>%
group_by(language, form) %>%
summarise(n.words = n())
vocab.admins <- admins %>%
select(data_id, language, form, age, sex, comprehension, production) %>%
filter(!is.na(sex), sex != "")
vocab.data <- vocab.admins %>%
group_by(language, form, sex, age) %>%
left_join(num.words) %>%
mutate(comprehension = as.numeric(comprehension)/n.words,
production = as.numeric(production)/n.words) %>%
gather(measure, value, comprehension, production) %>%
filter(form == "WG" | (form == "WS" & measure == "production"))
vocab.sample.sizes <- vocab.admins %>%
group_by(language, form) %>%
summarise(n.admins = length(unique(data_id)))
vocab.data %<>% left_join(vocab.sample.sizes) %>%
filter(n.admins >= 100) %>%
mutate(label = sprintf("%s %s (n = %s)", language, form, n.admins))
Plot WS vocabulary size across age.
ggplot(filter(vocab.data, form == "WS"), aes(x = age, y = value, color = sex)) +
facet_wrap(~ language) +
geom_jitter(size = 0.7) +
scale_color_brewer(palette = "Set1", name = "Sex") +
scale_y_continuous(limits = c(0, 1), breaks = seq(0, 1, 0.2),
name = "Vocabulary Size\n") +
xlab("\nAge (months)") +
theme_bw(base_size = 12) +
theme(text = element_text(family = font))
Plot WG vocabulary size across age.
ggplot(filter(vocab.data, form == "WG"), aes(x = age, y = value, color = sex)) +
facet_grid(language ~ measure) +
geom_jitter(size = 0.7) +
scale_color_brewer(palette = "Set1", name = "Sex") +
scale_y_continuous(limits = c(0, 1), breaks = seq(0, 1, 0.2),
name = "Vocabulary Size\n") +
xlab("\nAge (months)") +
theme_bw(base_size = 12) +
theme(text = element_text(family = font))
Plot WS vocabulary size across age with model fit curves.
ggplot(filter(vocab.data, form == "WS"), aes(x = age, y = value, color = sex)) +
facet_wrap(~ language) +
geom_smooth(method = "lm", formula = y ~ I(x^3) + I(x^2) + x) +
scale_color_brewer(palette = "Set1", name = "Sex") +
scale_y_continuous(limits = c(0, 1), breaks = seq(0, 1, 0.2),
name = "Vocabulary Size\n") +
xlab("\nAge (months)") +
theme_bw(base_size = 12) +
theme(text = element_text(family = font))
Plot WG vocabulary size across age with model fit curves.
ggplot(filter(vocab.data, form == "WG"), aes(x = age, y = value, color = sex)) +
facet_grid(language ~ measure) +
geom_smooth(method = "lm", formula = y ~ I(x^3) + I(x^2) + x) +
scale_color_brewer(palette = "Set1", name = "Sex") +
scale_y_continuous(limits = c(0, 1), breaks = seq(0, 1, 0.2),
name = "Vocabulary Size\n") +
xlab("\nAge (months)") +
theme_bw(base_size = 12) +
theme(text = element_text(family = font))
Function for resampling data.
sample.diffs <- function(d, num.times) {
get.diff <- function(group_data) {
pts <- seq(min(group_data$age), max(group_data$age), 1)
male <- predict(lm(value ~ I(age^3) + I(age^2) + I(age),
data = filter(group_data, sex == "M")),
newdata = data.frame(age = pts))
female <- predict(lm(value ~ I(age^3) + I(age^2) + I(age),
data = filter(group_data, sex == "F")),
newdata = data.frame(age = pts))
mean(female - male)
}
counter = 1
sample.diff <- function(d) {
d.frame <- d %>%
group_by(language, form, measure) %>%
sample_frac(replace = TRUE) %>% # resample kids
do(diff = get.diff(.)) %>%
mutate(diff = diff[1]) %>%
rename_(.dots = setNames("diff", counter))
counter <<- counter + 1 # increment counter outside scope
return(d.frame)
}
diffs <- replicate(num.times, sample.diff(d), simplify=FALSE)
Reduce(left_join, diffs) %>%
gather(sample, diff, -language, -form, -measure)
}
Resample data, compute difference in vocabulary size between sexes for each sample, find the mean and CI of the difference estimate.
diffs <- sample.diffs(vocab.data, 100)
diffs.summary <- diffs %>%
group_by(language, form, measure) %>%
summarise(mean = mean(diff, na.rm = TRUE),
ci.high = quantile(diff, 0.975, na.rm = TRUE),
ci.low = quantile(diff, 0.025, na.rm = TRUE)) %>%
ungroup() %>%
mutate(language = factor(language))
Plot mean difference by language, form, and measure.
ggplot(diffs.summary, aes(x = mean, y = language)) +
facet_grid(~ form + measure) +
geom_point(color = "steelblue") +
geom_segment(aes(x = ci.low, xend = ci.high,
y = language, yend = language),
color = "steelblue") +
geom_vline(xintercept = 0, linetype = "dashed", color = "gray") +
theme_bw() +
theme(text = element_text(family = font)) +
xlab("\nMean gender difference in vocabulary size across age") +
ylab("")